{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_kg_hide-output": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import lightgbm as lgb\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import datetime\n",
    "from meteocalc import feels_like, Temp\n",
    "from sklearn import metrics\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.api.types import is_datetime64_any_dtype as is_datetime\n",
    "from pandas.api.types import is_categorical_dtype\n",
    "\n",
    "def reduce_mem_usage(df, use_float16=False):\n",
    "    \"\"\"\n",
    "    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    \n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage of dataframe is {:.2f} MB\".format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        if is_datetime(df[col]) or is_categorical_dtype(df[col]):\n",
    "            continue\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == \"int\":\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype(\"category\")\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage after optimization is: {:.2f} MB\".format(end_mem))\n",
    "    print(\"Decreased by {:.1f}%\".format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('../Large_output/train_1.074.csv')\n",
    "train_df['meter_reading'] = np.expm1(train_df[\"meter_reading\"])\n",
    "train_df.loc[(train_df['site_id']==0) & (train_df['meter']==0),'meter_reading']\\\n",
    "=train_df.loc[(train_df['site_id']==0) & (train_df['meter']==0),'meter_reading'].mul(0.2931)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "target = np.log1p(train_df[\"meter_reading\"])\n",
    "features = train_df[['building_id',\\\n",
    " 'site_id',\\\n",
    " 'primary_use',\\\n",
    " 'meter',\\\n",
    " 'dayofweek',\\\n",
    " 'square_feet',\\\n",
    " 'year_built',\\\n",
    " 'floor_count',\\\n",
    " 'air_temperature',\\\n",
    " 'cloud_coverage',\\\n",
    " 'dew_temperature',\\\n",
    " 'precip_depth_1_hr',\\\n",
    " 'sea_level_pressure',\\\n",
    " 'wind_direction',\\\n",
    " 'wind_speed',\\\n",
    " 'relative_humidity',\\\n",
    " 'feels_like',\\\n",
    " 'hour']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_features = ['building_id', 'site_id', 'primary_use', 'meter','dayofweek']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import CatBoostRegressor\n",
    "from sklearn.metrics import mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/pandas/core/frame.py:4259: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  **kwargs\n"
     ]
    }
   ],
   "source": [
    "params = {'n_estimators': 3000,\n",
    "          'learning_rate':0.05,\n",
    "          'depth':12,\n",
    "          'eval_metric': 'RMSE',\n",
    "          'loss_function': 'RMSE',\n",
    "          'early_stopping_rounds' : 50,\n",
    "          'random_state':42,\n",
    "          'metric_period': 100,\n",
    "          'task_type': 'GPU',\n",
    "          'boosting_type': 'Plain',\n",
    "          'gpu_cat_features_storage': 'CpuPinnedMemory'\n",
    "          }\n",
    "features.fillna(-999, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 2640.82 MB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:24: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:22: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:31: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage after optimization is: 568.51 MB\n",
      "Decreased by 78.5%\n"
     ]
    }
   ],
   "source": [
    "features=reduce_mem_usage(features,use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:\tlearn: 1.9036093\ttest: 1.8990542\tbest: 1.8990542 (0)\ttotal: 1.89s\tremaining: 1h 34m 20s\n",
      "100:\tlearn: 0.9049218\ttest: 0.8915325\tbest: 0.8915325 (100)\ttotal: 2m 57s\tremaining: 1h 24m 52s\n",
      "200:\tlearn: 0.8375390\ttest: 0.8283873\tbest: 0.8283873 (200)\ttotal: 6m 4s\tremaining: 1h 24m 33s\n",
      "300:\tlearn: 0.7997424\ttest: 0.7980601\tbest: 0.7980601 (300)\ttotal: 9m 19s\tremaining: 1h 23m 35s\n",
      "400:\tlearn: 0.7741627\ttest: 0.7801037\tbest: 0.7801037 (400)\ttotal: 12m 39s\tremaining: 1h 22m 1s\n",
      "500:\tlearn: 0.7557908\ttest: 0.7699491\tbest: 0.7699491 (500)\ttotal: 15m 55s\tremaining: 1h 19m 25s\n",
      "600:\tlearn: 0.7407948\ttest: 0.7629022\tbest: 0.7629022 (600)\ttotal: 19m 9s\tremaining: 1h 16m 28s\n",
      "700:\tlearn: 0.7289020\ttest: 0.7569599\tbest: 0.7569599 (700)\ttotal: 22m 24s\tremaining: 1h 13m 30s\n",
      "800:\tlearn: 0.7191050\ttest: 0.7520300\tbest: 0.7520300 (800)\ttotal: 25m 40s\tremaining: 1h 10m 29s\n",
      "900:\tlearn: 0.7102967\ttest: 0.7483493\tbest: 0.7483493 (900)\ttotal: 28m 54s\tremaining: 1h 7m 20s\n",
      "1000:\tlearn: 0.7028909\ttest: 0.7452059\tbest: 0.7452059 (1000)\ttotal: 32m 17s\tremaining: 1h 4m 28s\n",
      "1100:\tlearn: 0.6964344\ttest: 0.7430195\tbest: 0.7430182 (1098)\ttotal: 35m 34s\tremaining: 1h 1m 21s\n",
      "1200:\tlearn: 0.6905453\ttest: 0.7411140\tbest: 0.7411125 (1199)\ttotal: 38m 56s\tremaining: 58m 19s\n",
      "1300:\tlearn: 0.6851404\ttest: 0.7396186\tbest: 0.7396088 (1299)\ttotal: 42m 12s\tremaining: 55m 7s\n",
      "1400:\tlearn: 0.6803136\ttest: 0.7380675\tbest: 0.7380675 (1400)\ttotal: 45m 39s\tremaining: 52m 6s\n",
      "1500:\tlearn: 0.6759033\ttest: 0.7367521\tbest: 0.7367488 (1499)\ttotal: 49m 4s\tremaining: 49m\n",
      "1600:\tlearn: 0.6715040\ttest: 0.7358610\tbest: 0.7358485 (1577)\ttotal: 52m 28s\tremaining: 45m 50s\n",
      "1700:\tlearn: 0.6675508\ttest: 0.7345036\tbest: 0.7345036 (1700)\ttotal: 55m 58s\tremaining: 42m 45s\n",
      "1800:\tlearn: 0.6636245\ttest: 0.7339156\tbest: 0.7338975 (1794)\ttotal: 59m 25s\tremaining: 39m 34s\n",
      "1900:\tlearn: 0.6600931\ttest: 0.7329319\tbest: 0.7329205 (1898)\ttotal: 1h 2m 43s\tremaining: 36m 15s\n",
      "2000:\tlearn: 0.6566246\ttest: 0.7321396\tbest: 0.7321380 (1995)\ttotal: 1h 5m 59s\tremaining: 32m 56s\n",
      "2100:\tlearn: 0.6534734\ttest: 0.7314059\tbest: 0.7314029 (2098)\ttotal: 1h 9m 22s\tremaining: 29m 41s\n",
      "2200:\tlearn: 0.6500426\ttest: 0.7310362\tbest: 0.7310334 (2198)\ttotal: 1h 12m 50s\tremaining: 26m 26s\n",
      "2300:\tlearn: 0.6471525\ttest: 0.7305514\tbest: 0.7305319 (2297)\ttotal: 1h 16m 8s\tremaining: 23m 7s\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-14-33f079a95ebd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     21\u001b[0m     \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCatBoostRegressor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m     \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_tr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_tr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meval_set\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcat_features\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcategorical_features\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     25\u001b[0m     \u001b[0my_pred_valid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/catboost/core.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, cat_features, sample_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model)\u001b[0m\n\u001b[1;32m   4237\u001b[0m                          \u001b[0muse_best_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meval_set\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogging_level\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_description\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4238\u001b[0m                          \u001b[0mverbose_eval\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetric_period\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msilent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mearly_stopping_rounds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4239\u001b[0;31m                          save_snapshot, snapshot_file, snapshot_interval, init_model)\n\u001b[0m\u001b[1;32m   4240\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4241\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mntree_start\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mntree_end\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthread_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/catboost/core.py\u001b[0m in \u001b[0;36m_fit\u001b[0;34m(self, X, y, cat_features, text_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model)\u001b[0m\n\u001b[1;32m   1688\u001b[0m                 \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1689\u001b[0m                 \u001b[0mallow_clear_pool\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1690\u001b[0;31m                 \u001b[0mtrain_params\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"init_model\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1691\u001b[0m             )\n\u001b[1;32m   1692\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/catboost/core.py\u001b[0m in \u001b[0;36m_train\u001b[0;34m(self, train_pool, test_pool, params, allow_clear_pool, init_model)\u001b[0m\n\u001b[1;32m   1223\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1224\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_train\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallow_clear_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minit_model\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1225\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_object\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_train\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallow_clear_pool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minit_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_object\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minit_model\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1226\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_trained_model_attributes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1227\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m_catboost.pyx\u001b[0m in \u001b[0;36m_catboost._CatBoost._train\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32m_catboost.pyx\u001b[0m in \u001b[0;36m_catboost._CatBoost._train\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# run kf = 5 not fully run, run by other servies without notebook saving\n",
    "from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold\n",
    "NFOLDS = 5\n",
    "columns = features.columns\n",
    "kf = StratifiedKFold(n_splits=5, shuffle=False, random_state=2319)\n",
    "splits=kf.split(features,train_df['month'])\n",
    "y_oof = np.zeros(features.shape[0])\n",
    "score = 0\n",
    "out_folder_train_prediction= pd.DataFrame()\n",
    "feature_importance_df = pd.DataFrame()\n",
    "\n",
    "models = []\n",
    "\n",
    "\n",
    "\n",
    "for fold_n, (train_index, valid_index) in enumerate(splits):\n",
    "    X_tr=features.iloc[train_index]\n",
    "    y_tr=target.iloc[train_index]\n",
    "    X_val=features.iloc[valid_index]\n",
    "    y_val=target.iloc[valid_index]\n",
    "    \n",
    "    model = CatBoostRegressor(**params)\n",
    "        \n",
    "    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), cat_features=categorical_features, verbose=True)\n",
    "\n",
    "    y_pred_valid = model.predict(X_val)\n",
    "    y_oof[valid_index] = y_pred_valid\n",
    "    print(f\"Fold {fold_n + 1} | rmse: {np.sqrt(mean_squared_error(y_val, y_pred_valid))}\")\n",
    "    \n",
    "    score += np.sqrt(mean_squared_error(y_val, y_pred_valid)) / NFOLDS\n",
    "    \n",
    "    oof_preds=pd.DataFrame()\n",
    "    oof_preds['train_index']=valid_index\n",
    "    oof_preds['TARGET']= y_pred_valid\n",
    "    oof_preds[\"folder\"]=fold_n + 1\n",
    "    out_folder_train_prediction = pd.concat([out_folder_train_prediction, oof_preds], axis=0)\n",
    "    \n",
    "    \n",
    "    fold_importance_df = pd.DataFrame()\n",
    "    fold_importance_df['feature']=columns\n",
    "    fold_importance_df['importance']=model.get_feature_importance()\n",
    "    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
    "    \n",
    "    models.append(model)\n",
    "    del X_val,X_tr,y_val,y_tr\n",
    "    gc.collect()\n",
    "    \n",
    "print(f\"\\nMean rmse = {score}\")\n",
    "# print(f\"Out of folds rmse = {np.sqrt(mean_squared_error((target, y_oof)))}\")\n",
    "out_folder_train_prediction.to_csv('out_folder_train_prediction_cat_1.074.csv',index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = pd.read_csv('../Large_output/test_1.074.csv')\n",
    "X_test = test_df[['building_id',\\\n",
    "                 'site_id',\\\n",
    "                 'primary_use',\\\n",
    "                 'meter',\\\n",
    "                 'dayofweek',\\\n",
    "                 'square_feet',\\\n",
    "                 'year_built',\\\n",
    "                 'floor_count',\\\n",
    "                 'air_temperature',\\\n",
    "                 'cloud_coverage',\\\n",
    "                 'dew_temperature',\\\n",
    "                 'precip_depth_1_hr',\\\n",
    "                 'sea_level_pressure',\\\n",
    "                 'wind_direction',\\\n",
    "                 'wind_speed',\\\n",
    "                 'relative_humidity',\\\n",
    "                 'feels_like',\\\n",
    "                 'hour']]\n",
    "X_test.fillna(-999, inplace=True)\n",
    "X_test=reduce_mem_usage(X_test,use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# separate the test df into 120 part to predict because of limitation of cpu\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "def predictions(models, iterations = 120):\n",
    "    # split test data into batches\n",
    "    set_size = len(test_df)\n",
    "    batch_size = set_size // iterations\n",
    "    meter_reading = []\n",
    "    for i in tqdm(range(iterations)):\n",
    "        pos = i*batch_size\n",
    "        fold_preds = [np.expm1(model.predict(test_df.iloc[pos : pos+batch_size])) for model in models]\n",
    "        meter_reading.extend(np.mean(fold_preds, axis=0))\n",
    "\n",
    "    print(len(meter_reading))\n",
    "    assert len(meter_reading) == set_size\n",
    "    test_df['meter_reading']=np.clip(meter_reading, 0, a_max=None)\n",
    "    test_df.loc[(test_df['site_id']==0) & (test_df['meter']==0),'meter_reading']=test_df.loc[(test_df['site_id']==0) &\\\n",
    "                                                                                             (test_df['meter']==0),'meter_reading'].mul(3.4118)\n",
    "    submission = pd.read_csv('../Resources/sample_submission.csv')\n",
    "    submission['meter_reading'] = test_df['meter_reading']\n",
    "    submission.to_csv('../Large_output/cat_new_csv_nomonth.csv', index=False)\n",
    "    print('We are done!')\n",
    "predictions(models)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {
     "00804b9a5db648a78fa0aa6267f140b3": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "DescriptionStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "StyleView",
       "description_width": ""
      }
     },
     "0742e7e6f80348b5813405792ddcbac8": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.2.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "justify_items": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "object_fit": null,
       "object_position": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "0a225a18a8b5468bac1bdcf2402e7f76": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "HTMLModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.5.0",
       "_view_name": "HTMLView",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_0742e7e6f80348b5813405792ddcbac8",
       "placeholder": "​",
       "style": "IPY_MODEL_88b15454f8ee47f98a5bd7ce6ea6cfc9",
       "value": " 5/5 [1:21:44&lt;00:00, 980.94s/it]"
      }
     },
     "192cfc30717143f8853f861875044b4c": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.2.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "justify_items": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "object_fit": null,
       "object_position": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "40da3685dd7c440fbd11cf40a9836a67": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "ProgressStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "StyleView",
       "bar_color": null,
       "description_width": "initial"
      }
     },
     "4996858666bc401d8323f34e81120c5c": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.2.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "justify_items": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "object_fit": null,
       "object_position": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "626f6d0255f44f78af96410c96b36c7c": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "IntProgressModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "IntProgressModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.5.0",
       "_view_name": "ProgressView",
       "bar_style": "success",
       "description": "100%",
       "description_tooltip": null,
       "layout": "IPY_MODEL_d5bf07ceac1c42f08eac707d31b2ba8a",
       "max": 5,
       "min": 0,
       "orientation": "horizontal",
       "style": "IPY_MODEL_40da3685dd7c440fbd11cf40a9836a67",
       "value": 5
      }
     },
     "640df1bbc3fd4e76aab98466599c8db0": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.2.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "justify_items": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "object_fit": null,
       "object_position": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "88b15454f8ee47f98a5bd7ce6ea6cfc9": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "DescriptionStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "StyleView",
       "description_width": ""
      }
     },
     "8c81e5368fab49f085ad69a85ef4727e": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "HBoxModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.5.0",
       "_view_name": "HBoxView",
       "box_style": "",
       "children": [
        "IPY_MODEL_626f6d0255f44f78af96410c96b36c7c",
        "IPY_MODEL_0a225a18a8b5468bac1bdcf2402e7f76"
       ],
       "layout": "IPY_MODEL_4996858666bc401d8323f34e81120c5c"
      }
     },
     "aa2357090f8b4bc58072237c7797bbf3": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "HBoxModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.5.0",
       "_view_name": "HBoxView",
       "box_style": "",
       "children": [
        "IPY_MODEL_ff2b16baf05141a48711db2077f6c6e0",
        "IPY_MODEL_ad0bfc367b2741518849b6140a9efafd"
       ],
       "layout": "IPY_MODEL_192cfc30717143f8853f861875044b4c"
      }
     },
     "ad0bfc367b2741518849b6140a9efafd": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "HTMLModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.5.0",
       "_view_name": "HTMLView",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_e0753dc4920345c4b75724e815a2040e",
       "placeholder": "​",
       "style": "IPY_MODEL_00804b9a5db648a78fa0aa6267f140b3",
       "value": " 120/120 [3:07:14&lt;00:00, 93.62s/it]"
      }
     },
     "d5bf07ceac1c42f08eac707d31b2ba8a": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.2.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "justify_items": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "object_fit": null,
       "object_position": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "e0753dc4920345c4b75724e815a2040e": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.2.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "justify_items": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "object_fit": null,
       "object_position": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "eeaf84aa368347a0baf59c0ec3059bba": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "ProgressStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.2.0",
       "_view_name": "StyleView",
       "bar_color": null,
       "description_width": "initial"
      }
     },
     "ff2b16baf05141a48711db2077f6c6e0": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "IntProgressModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.5.0",
       "_model_name": "IntProgressModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.5.0",
       "_view_name": "ProgressView",
       "bar_style": "success",
       "description": "100%",
       "description_tooltip": null,
       "layout": "IPY_MODEL_640df1bbc3fd4e76aab98466599c8db0",
       "max": 120,
       "min": 0,
       "orientation": "horizontal",
       "style": "IPY_MODEL_eeaf84aa368347a0baf59c0ec3059bba",
       "value": 120
      }
     }
    },
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
